--- title: GTEx v7 - Download and Basic library preparation keywords: fastai sidebar: home_sidebar nb_path: "01_GTEx.ipynb" ---
# logger.info('logger info')
# logger.warning('logger.warning')
import os, sys, glob
from logzero import logger
import inspect, urllib
import pandas
import scipy
import numpy as np
from pprint import pprint
import time, datetime
import seaborn as sns
import plotly.express as px
# from chart_studio import plotly as py
import plotly.figure_factory as ff
import plotly.graph_objects as go
# __file__ does not work in a jupyter notebook
# make something equivalent - actually only for this particular cell
import inspect
# __file__ does not work in a jupyter notebook
# make something equivalent - actually only for this particular cell
__file__ = os.path.abspath(inspect.getfile(lambda: None))
import pathlib
script_dir = pathlib.Path().resolve()
# Gene transcripts per million data
# Open a handle onto the GTEx expression data
GTEX_URL = "https://storage.googleapis.com/gtex_analysis_v8"
GTEXV8_TPM = os.path.join(GTEX_URL, "rna_seq_data", "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz")
GTEXV8_TPM_MED = os.path.join(GTEX_URL, "rna_seq_data", "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz")
GTEX_PHENO_DS = os.path.join(GTEX_URL, "annotations", "GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt")
GTEX_PHENO_DD = os.path.join(GTEX_URL, "annotations", "GTEx_Analysis_v8_Annotations_SubjectPhenotypesDD.xlsx")
GTEX_SAMPLE_DS = os.path.join(GTEX_URL, "annotations", "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt")
GTEX_SAMPLE_DD = os.path.join(GTEX_URL, "annotations", "GTEx_Analysis_v8_Annotations_SampleAttributesDD.xlsx")
from tqdm import tqdm
import requests
cache_dir = os.path.join(script_dir, "data")
if os.path.exists(cache_dir):
logger.info(f"Found {cache_dir}")
else:
os.mkdir(cache_dir)
for url in [GTEXV8_TPM_MED, GTEXV8_TPM, GTEX_PHENO_DS, GTEX_PHENO_DD, GTEX_SAMPLE_DS, GTEX_SAMPLE_DD]:
dest = os.path.join(cache_dir, os.path.basename(url))
if os.path.exists(dest):
logger.info(f"found existing: {dest}")
else:
logger.info(f"Downloading {dest}")
response = requests.get(url, stream=True)
with open(dest, "wb") as fh:
for data in tqdm(response.iter_content()):
fh.write(data)
logger.info(f"Completed {dest}")
GTEXV8_TPM = os.path.join(GTEX_URL, "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz")
GTEXV8_TPM_MED = os.path.join(GTEX_URL, "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz")
GTEX_PHENO = os.path.join(GTEX_URL, "GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt")
GTEX_SAMPLE_DS = os.path.join(GTEX_URL, "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt")
GTEX_SAMPLE_DD = os.path.join(GTEX_URL, "GTEx_Analysis_v8_Annotations_SampleAttributesDD.xlsx")
# Detailed data file:
gtex_tpm_fn = os.path.join(cache_dir, os.path.basename(GTEXV8_TPM))
# Just looking at median values by tissue (SMTSD) greatly reduces file size
gtex_tpm_med_fn = os.path.join(cache_dir, os.path.basename(GTEXV8_TPM_MED))
# Sample annotation data
# Subject Phenotype data
# Age, sex and Hardy Scale death circumstances
gtex_pheno_fn = os.path.join(cache_dir, os.path.basename(GTEX_PHENO))
# Main sample data of interest is:
# 'SMTS': 'Tissue Type, area from which the tissue sample was taken. This is a parent value to SMTSD.'
# 'SMTSD': 'SMTS Detailed'
gtex_attr_fn = os.path.join(cache_dir, os.path.basename(GTEX_SAMPLE_DS))
gtex_attr_desc_fn = os.path.join(cache_dir, os.path.basename(GTEX_SAMPLE_DD))
# Load tissue specific detatils
gtex_attr = pandas.read_csv(gtex_attr_fn, sep='\t')
tissue_types = sorted(gtex_attr['SMTS'].unique())
tissue_dict = dict()
for tissue in tissue_types:
tissue_dict[tissue] = dict()
tissue_dict[tissue]['samples'] = gtex_attr[gtex_attr['SMTS'] == tissue]['SAMPID'].to_list()
tissue_dict[tissue]['subtypes'] = dict()
for subtype in gtex_attr[gtex_attr['SMTS'] == tissue]['SMTSD'].unique():
# Get all samples by subtype
tissue_dict[tissue]['subtypes'][subtype] = gtex_attr[gtex_attr['SMTSD'] == subtype]['SAMPID'].to_list()
sorted(gtex_attr[gtex_attr['SMTS'] == tissue]['SMTSD'].unique())
tissue_subtypes = sorted(gtex_attr['SMTSD'].unique())
# log number of tissue samples
logger.info('%s separate tissue types - total samples %s', len(tissue_dict), len(gtex_attr['SAMPID']))
for tissue in sorted(tissue_dict.keys()):
logger.info('%s (n=%s)', tissue, len(tissue_dict[tissue]['samples']))
if len(tissue_dict[tissue]['subtypes']) > 1:
for subtype in sorted(tissue_dict[tissue]['subtypes'].keys()):
logger.info('\t%s (n=%s)', subtype, len(tissue_dict[tissue]['subtypes'][subtype]))
# logger.info(pprint.pformat(tissue_dict))
# Other annotation data
# Mutually exclusive
# Which study was this from?
# mut_info_fn = os.path.join(cache_dir, 'enrichments-analysis-result.txt')
#mut_info = pandas.read_csv(mut_info_fn, sep='\t')
# Human housekeeping gene - stable expression
## https://www.cell.com/trends/genetics/pdf/S0168-9525(13)00089-9.pdf - Human housekeeping genes, revisited
# https://m.tau.ac.il/~elieis/HKG/HK_genes.txt
# E. Eisenberg and E.Y. Levanon, Trends in Genetics, 29 (2013)
# The full set of all all GTEx tpms is very large and can be hard to work with
# Some functions only load some of the data at a time.
# Genes of interest - some test genes
TEST_GENES = [
'TP53',
'ERBB2', # Herceptin target
'EGFR',
'AKT1',
'KRAS',
'PTEN',
'APOE',
]
TUMOUR_MUT_GENES = [ # www.tumourportal.org - highly mutated
'TP53',
'PIK3CA',
'PTEN',
'KRAS',
'APC',
'MLL3', # aka KMT2C
'KMT2C',
'FAT1',
'MLL2', # aka KMT2D
'KMT2D',
'ARID1A',
'VHL',
'PBRM1',
'NF1',
'EGFR',
'ATM',
'PIK3R1',
'BRAF',
'CDKN2A',
'SETD2',
'CREBBP',
'FBXW7',
'SPEN',
'MTOR',
'RB1',
'SMARCA4',
'NOTCH1',
]
TEST_GENES = sorted(set(TEST_GENES + TUMOUR_MUT_GENES))
# gtex_tpm file has genes as rows.
# First two columns are gene information
# Over 10000 columns of samples
# Name Description GTEX-1117F-0226-SM-5GZZ7 GTEX-111CU-1826-SM-5GZYN ...
#0 ENSG00000223972.4 DDX11L1 0.10820 0.11580
#1 ENSG00000227232.4 WASH7P 21.40000 11.03000
GTPM_SKIP = 2 # skip two comment rows at the beginning of the file
# Just getting a few rows of this giant file
logger.info('Loading Row partial table ...')
gtpm_row_table = pandas.read_csv(gtex_tpm_fn, skiprows=2, sep='\t', usecols=range(4))
logger.info('Loading column partial table ...')
gtpm_column_table = pandas.read_csv(gtex_tpm_fn, skiprows=lambda x: x not in [2, 3], sep='\t')
def gtpm_row_by_genename(gene_name):
sub_table = gtpm_row_table[gtpm_row_table['Description'] == gene_name]
if sub_table.empty:
logger.error("Missing '%s row in GTEX TPM table'", gene_name)
return None
return [GTPM_SKIP + 1 + i for i in sub_table.index][0]
def gtpm_keep_rows(gene_list=TEST_GENES):
row_list = [gtpm_row_by_genename(gene) for gene in gene_list]
return [GTPM_SKIP] + [gene for gene in row_list if gene]
def gtex_tpm_partial_table_load(gene_list, sample_list=None):
start_time = time.time()
if gene_list:
keep_rows = gtpm_keep_rows(gene_list)
skiprows_func = lambda x: x not in keep_rows
logger.info('loading %s GTEX TPM rows', len(keep_rows))
else:
logger.info('loading all rows')
skiprows_func = lambda x: x in [0, 1]
if sample_list:
logger.info('Limiting load to %s potential samples', len(sample_list))
# Not all samples are in tables
sample_list = [s for s in sample_list if s in gtpm_column_table.columns]
logger.info('Limiting load to %s samples', len(sample_list))
columns = ['Name', 'Description'] + sample_list
else:
sample_list = None
gtpm_partial = pandas.read_csv(
gtex_tpm_fn,
skiprows=skiprows_func, # Only selected gene rows
sep='\t',
header=0,
usecols=sample_list, # Number of samples limit
)
end_time = time.time()
logger.info("Gtex TPM partial table took {} minutes".format((end_time - start_time)/60))
return gtpm_partial
logger.info(f"loading median GTEx tissue TPM values: {gtex_tpm_med_fn}")
gtpm_med = pandas.read_csv(gtex_tpm_med_fn, sep='\t', skiprows=2, low_memory=False)
# gtpm_breast = gtex_tpm_partial_table_load(gene_list=TEST_GENES, sample_list=tissue_dict['Breast']['samples'])
logger.info(f"loading GTEX TPMs for {len(TEST_GENES)} genes")
logger.info(f" GTEX GENES {TEST_GENES}")
gtpm = gtex_tpm_partial_table_load(gene_list=TEST_GENES,)
# Remove missing genes from the test
TEST_GENES = gtpm['Description'].to_list()
The smaller table of median TPM values can give us a good idea of the overall character of the different genes.
The spread includes extreme values, most of the tissues are similar.
Mean is ~ 16 - 17 TPM, but 50th percentile is 0 and 75th percentile is ~ 2.
This indicates that the TPM is not a normal distribution, but has a small number of extreme values.
# Do something with the median values
rename_cols = dict()
for t in tissue_dict.keys():
# pprint(tissue_dict[t]['subtypes'].keys())
# pick a single representative tissue subtype
subtype = sorted(tissue_dict[t]['subtypes'].keys())[-1]
if subtype != 'Cells - Leukemia cell line (CML)':
rename_cols[subtype] = t
sample_columns = tuple(t
for t in tissue_dict.keys()
)
# gm = gtpm_med.drop('gene_id', axis=1).set_index('Description')
gm = gtpm_med.set_index('Description')
gm.rename(columns=rename_cols, inplace=True)
gm = gm[sorted(rename_cols.values())]
# Copy table so we can add and modify values
gm = gm.copy()
# Using gm table we can chacterize the genes a bit.
# the spread includes extreme values, most of the tissues are similar
# Mean is ~ 16 - 17 TPM, but 50th percentile is 0 and 75th percentile is ~ 2
pandas.set_option('max_columns', 60)
pandas.set_option('display.width', 120)
#print(gm.describe())
print(gm.loc[TEST_GENES].head())
# Apply some known values
gm['mean'] = gm.apply(lambda row: row.mean(), axis=1)
gm['median'] = gm.apply(lambda row: row.median(), axis=1)
gm['std'] = gm.apply(lambda row: row.std(), axis=1)
#gm['mode'] = gm.apply(lambda row: row.mode(), axis=1)
gm['CoV'] = gm.apply(lambda row: np.inf if row['mean'] == 0 else row['std']/row['mean'], axis=1)
gm['sem'] = gm.apply(lambda row: row.sem(), axis=1)
gm.head()
# high expression # low deviation among tissues
gm_bench = gm[(gm['median'] >= 1) # high expression
& (gm['median'] - 3.5 * gm['std'] > 0)
]
#cross_tissue_genes = sorted(gm_bench['Description'].tolist())
print("Of {} genes, the top {} in stable cross-tissue expression are:".format(len(gm), len(gm_bench)))
#for gene in cross_tissue_genes:
# print('\t{}'.format(gene))
gm_bench.sort_values(by='CoV')
gm.sort_values(by='sem').head(30)
gm.loc[TEST_GENES]
gt = gtpm[gtpm.columns[1:]]
gt.set_index('Description', inplace=True)
org_cols = gt.columns
#gtt = gt.transpose()
#gtt.head()
gt.head()
print(gt.loc['TP53'].describe())
print(gt.loc['ERBB2'].describe())
print(gt.loc['PTEN'].describe())
#gt.loc['PTEN'].skew()
#gt.loc['PTEN'].kurtosis()
gt.loc['PTEN'].quantile(0.9)
gtpm.head()
#gt.T[TEST_GENES[:-1]].iplot(kind='box',)
#gt.T[['TP53', 'PTEN']].plot(kind='box',)
import math
#gene = 'TP53'
gene = 'PTEN'
title = '{}: Samples {}'.format(gene, len(gt.loc[gene]))
#gtt[gene].iplot(kind='hist', title=title, yTitle='counts', xTitle='Expression (TPM)', bins=100)
#py.iplot(gt.loc[gene].to_list(), kind='hist', title=title, yTitle='counts', xTitle='Expression (TPM)', bins=100)
#gt.loc[gene].iplot(kind='hist', title=title, yTitle='counts', xTitle='Expression (TPM)', bins=100)
#gt.loc[gene].plot(kind='hist', title=title, bins=100)
glog2 = gt.loc[gene].apply(math.log2)
glog2.plot(kind='hist', title=title, bins=100)
px.histogram(gt.loc[gene])
#print(ff.create_distplot.__doc__)
gene_list = ['TP53', 'PTEN', 'KRAS', 'ERBB2', 'CDKN2A']
gene_list = ['TP53', 'PTEN', 'KRAS',]
px.histogram([gt.loc[g] for g in gene_list], gene_list)
gene = 'TP53'
data_lists = []
labels = []
for tissue in tissue_dict.keys():
samples = tissue_dict[tissue]['samples']
# print(tissue)
# print(len(samples))
gt_samples = [s for s in samples if s in gt.columns]
if len(gt_samples) > 20:
dt = gt.loc[gene][gt_samples].dropna()
if not dt.empty:
data_lists.append(dt)
labels.append('{}({}) ~ {}'.format(tissue, len(samples), round(dt.median(), 1)))
else:
print('Only {} samples for "{}"'.format(len(gt_samples), tissue))
# add all samples
dt = gt.loc[gene].dropna()
data_lists.append(dt)
labels.append('{} - all ({}) ~ {}'.format(gene, len(dt), round(dt.median(), 1)))
fig_tp53 = ff.create_distplot(data_lists, labels, show_hist=False, show_rug=False, show_curve=True) # curve_type='normal')
fig_tp53.layout.update(title=gene)
fig_tp53.show()
dir(go)
gene='PTEN'
tpm_label = "TPM {} skew: {} kurtosis: {}".format(gene,
round(gt.loc[gene].skew(), 2),
round(gt.loc[gene].kurtosis(), 2))
log2_label = "log2TPM {} skew: {} kurtosis: {}".format(gene,
round(gt.loc[gene].apply(np.log2).skew(), 2),
round(gt.loc[gene].apply(np.log2).kurtosis(), 2))
fig_log2vsNormal = ff.create_distplot([gt.loc[gene].apply(np.log2), gt.loc[gene]],
[log2_label, tpm_label],
bin_size=[0.1, 2],
curve_type='normal',)
fig_log2vsNormal.show()
Data like TPM is lognormal distributed.
log transforms (like fold-change - log 2) are more normally disturbuted.
More irregular curves, like TP53 show more log-normal distributions when clustered into relevent groups, like tissue types.
Otherwise, something like sklearn.preprocessing.RobustScaler looks good for IQR scaling - robust to outliers.
This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range). The IQR is the range between the 1st quartile (25th quantile) and the 3rd quartile (75th quantile).
Gene expressions are correlated / anti-correlated (mutual information/effect) in interaction pathways. 'Activation' of certiain pathways is of interest for cancer and drug effects measures.
PCA with 'whitening'
# Data is now effectively in fold change units
# Use a minum value to prevent div by zero.
#safelog2 = lambda x: numpy.log2(x) if x else numpy.log2(sys.float_info.min)
safelog2 = lambda x: np.log2(x) if x else np.nan
gfc = gt.applymap(safelog2)
print(gfc.info())
print(gfc.head())
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
# Which way are we scaling? Rows or columns?
rs = RobustScaler()
scaler = rs.fit(gfc.T)
gfc_norm = pandas.DataFrame(scaler.transform(gfc.T), index=gfc.columns, columns=gfc.index)
gfc_norm.dropna(inplace=True)
print(gfc_norm.head())
gfc_norm.describe()
gene = 'PTEN'
data_lists = []
labels = []
for tissue in tissue_dict.keys():
samples = tissue_dict[tissue]['samples']
# print(tissue)
# print(len(samples))
gt_samples = [s for s in samples if s in gfc_norm.index]
if len(gt_samples) > 20:
dt = gfc_norm[gene][gt_samples].dropna()
if not dt.empty:
data_lists.append(dt)
labels.append('{}({}) ~ {}'.format(tissue, len(samples), round(dt.median(), 1)))
else:
print('Only {} samples for "{}"'.format(len(gt_samples), tissue))
# add all samples
dt = gfc_norm[gene].dropna()
data_lists.append(dt)
labels.append('{} - all ({}) ~ {}'.format(gene, len(dt), round(dt.median(), 1)))
fig_log2tissue = ff.create_distplot(data_lists, labels, show_hist=False, show_rug=False, show_curve=True) # curve_type='normal')
fig_log2tissue.layout.update(title=gene)
fig_log2tissue.show()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
import sklearn.decomposition
print('sklearn.decomposition')
pprint([f for f in dir(sklearn.decomposition) if str(f)[0] != '_'])
import sklearn.discriminant_analysis
print('sklearn.discriminant_analysis')
pprint([f for f in dir(sklearn.discriminant_analysis) if str(f)[0] != '_'])
print('sklearn.ensemble')
import sklearn.ensemble
pprint([f for f in dir(sklearn.ensemble) if str(f)[0] != '_'])
print('sklearn.metrics')
import sklearn.metrics
pprint([f for f in dir(sklearn.metrics) if str(f)[0] != '_'])
pprint(dir(sklearn.metrics))
# sparse PCA is probably better - interpret which genes are more important.
pca3 = PCA(n_components=3) # Alt opt PCA(0.99) - enough components for 99% of variance
sparse_pca = sklearn.decomposition.SparsePCA(n_components=3)
sparse_comp = sparse_pca.fit_transform(gfc_norm)
components3 = pca3.fit_transform(gfc_norm)
print(pandas.DataFrame(components3).head())
pca3.explained_variance_ratio_
sparse_comp = pandas.DataFrame(sparse_pca.components_, columns=gfc_norm.columns)
print(sparse_comp)
pc1 = sparse_comp.loc[0]
#pc1_genes = [g.index for g in pc1 if g != 0]
print(pc1)
pc1 = pc1.apply(abs)
#print(pc1.sort_values.__doc__)
pc1.sort_values(ascending=False, inplace=True)
print(pc1)
pc1_gene_list =[gene for gene in pc1.index if pc1[gene] != 0]
print(pc1_gene_list)
gtm = pandas.read_csv(gtex_tpm_med_fn, sep='\t', skiprows=2, low_memory=False)
gtm.set_index(['Description', 'Name'], inplace=True)
#tissues = [(col.split(' - ')[0], col.replace(col.split(' - ')[0] + ' - ', '')) for col in gtm.columns]
tissues = [col.split(' - ')[0] for col in gtm.columns]
subtypes = [col.replace(col.split(' - ')[0] + ' - ', '') for col in gtm.columns]
gtm.columns = [tissues, subtypes]
print(gtm.head())
print(gtm.loc['TP53', 'Brain'])
# Makes some reasonable subtables. With droplevel seems to work ok.
gtm.loc[:, 'Brain']
#gtm.loc[:, gtm.columns.get_level_values(1) == 'Cortex']
logger.info("Starting g8 load")
_g8 = pandas.read_csv(
gtex_tpm_fn,
skiprows=2, # First 2 rows are comments of file size - other func here to select patient groups
sep='\t',
header=0,
# usecols=sample_list, # Number of samples limit
)
logger.info("\tfinished g8 load")
g8 = _g8.drop('gene_id', axis=1)
g8 = g8.set_index('Description')
g8.rename(columns=rename_cols, inplace=True)
g8.head()